import pandas as pd
import plotly.graph_objects as go
import os
# Pull each state's data into one dataframe.
data_folder = 'lib/state/'
# os.listdir returns entries in arbitrary order and also lists sub-directories
# and hidden entries (e.g. .ipynb_checkpoints, .DS_Store) that pd.read_csv
# cannot parse. Keep only regular, non-hidden files and sort them so the row
# order of agg_df is reproducible from run to run.
files = sorted(
    entry for entry in os.listdir(data_folder)
    if not entry.startswith('.')
    and os.path.isfile(os.path.join(data_folder, entry))
)
# The files are read with header=None, so column names are assigned below.
# pd.concat keeps each file's own row labels (it does not renumber them).
agg_df = pd.concat([pd.read_csv(os.path.join(data_folder, file), header=None)
                    for file in files])
agg_df.columns = ['state', 'sex', 'year', 'name', 'count']
agg_df
| | state | sex | year | name | count |
|---|---|---|---|---|---|
| 0 | AK | F | 1910 | Mary | 14 |
| 1 | AK | F | 1910 | Annie | 12 |
| 2 | AK | F | 1910 | Anna | 10 |
| 3 | AK | F | 1910 | Margaret | 8 |
| 4 | AK | F | 1910 | Helen | 7 |
| ... | ... | ... | ... | ... | ... |
| 28970 | WY | M | 2022 | Lane | 5 |
| 28971 | WY | M | 2022 | Michael | 5 |
| 28972 | WY | M | 2022 | Nicholas | 5 |
| 28973 | WY | M | 2022 | River | 5 |
| 28974 | WY | M | 2022 | Silas | 5 |
6408041 rows × 5 columns
# define a function to get most common first letter for a state-year pair
def most_common_first_letter(s):
    """Return the first letter with the largest total count in *s*.

    Parameters
    ----------
    s : pandas.DataFrame
        Rows with a string ``name`` column and a numeric ``count`` column.

    Returns
    -------
    The first character of ``name`` whose rows have the highest summed
    ``count``.

    Notes
    -----
    The input frame is left untouched.
    """
    # assign() returns a new frame, so the caller's 'name' column is not
    # overwritten with single letters (the grouping itself is unchanged).
    by_letter = s.assign(name=s['name'].str[0])
    return by_letter.groupby('name')['count'].sum().idxmax()
# Build a dataframe with one row per (state, year) pair holding that pair's
# most common first letter. The grouped apply yields an unnamed result that
# reset_index() exposes as column 0, which is then renamed to 'firstLetter'.
df = (
    agg_df
    .groupby(['state', 'year'])[['name', 'count']]
    .apply(most_common_first_letter)
    .reset_index()
    .rename(columns={0: 'firstLetter'})
)
df
| | state | year | firstLetter |
|---|---|---|---|
| 0 | AK | 1910 | A |
| 1 | AK | 1911 | J |
| 2 | AK | 1912 | J |
| 3 | AK | 1913 | M |
| 4 | AK | 1914 | J |
| ... | ... | ... | ... |
| 5758 | WY | 2018 | A |
| 5759 | WY | 2019 | E |
| 5760 | WY | 2020 | A |
| 5761 | WY | 2021 | E |
| 5762 | WY | 2022 | E |
5763 rows × 3 columns
# For each year, and for each letter that is the most common first letter in
# at least one state that year, take the most common baby name that year that
# starts with that letter. Counts are summed over every row of agg_df for the
# year, i.e. across all states and both sexes.
rep_names = {}
years = df['year'].unique()
# Loop-invariant: extract every name's first letter once instead of
# re-scanning the whole table for each (year, letter) pair.
name_initials = agg_df['name'].str[0].to_numpy()
for year in years:
    # Select this year's rows once and reuse them for all of its letters.
    in_year = (agg_df['year'] == year).to_numpy()
    year_rows = agg_df[in_year]
    year_initials = name_initials[in_year]
    year_list = []
    for letter in df.loc[df['year'] == year, 'firstLetter'].unique():
        matched_names = year_rows[year_initials == letter]
        most_common_name = matched_names.groupby('name')['count'].sum().idxmax()
        year_list.append(most_common_name)
    rep_names[year] = year_list
def _traces_for_year(year):
    """Return the [choropleth, text-label] traces for one year's rows of df."""
    year_rows = df[df['year'] == year]
    states = year_rows['state']
    letters = year_rows['firstLetter']
    # Letters are coloured by their code point (ord); the same fixed
    # 64-90 colour range is used for every year.
    shading = go.Choropleth(
        locations=states,
        locationmode='USA-states',
        z=letters.apply(ord),
        colorscale='Viridis',
        zmin=64,
        zmax=90,
    )
    # The letter itself is drawn in white text on top of each state.
    labels = go.Scattergeo(
        locationmode='USA-states',
        locations=states,
        text=letters,
        textfont={'color': 'White'},
        mode='text',
    )
    return [shading, labels]


def _frame_layout(year):
    """Return the layout for one animation frame: title plus name annotation."""
    caption = dict(
        x=.3,
        y=.95,
        xanchor='left',
        xref='paper',
        yref='paper',
        text='Most common name starting with these letters: ' + ', '.join(rep_names[year]),
        showarrow=False,
    )
    return go.Layout(
        width=900,
        height=600,
        title={
            'text': f'Most common first letter for baby names in each state in {year}',
            'x': 0.5,
        },
        geo=dict(scope='usa', projection_type='albers usa'),
        annotations=[caption],
    )


# Animated US map: the initial view uses the 1911 data, and the "Play" button
# steps through one frame per year in `years`.
play_button = dict(label="Play", method="animate", args=[None])
base_layout = go.Layout(
    width=900,
    height=600,
    title={
        'text': 'Most common first letter for baby names in each state per year',
        'x': 0.5,
    },
    geo=dict(scope='usa', projection_type='albers usa'),
    updatemenus=[dict(type="buttons", buttons=[play_button])],
)
fig = go.Figure(
    data=_traces_for_year(1911),
    layout=base_layout,
    frames=[
        go.Frame(data=_traces_for_year(year), layout=_frame_layout(year))
        for year in years
    ],
)
fig.show()